library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(polycor)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr) ; library(kableExtra)
library(foreach) ; library(doParallel)

# Return the SFARI gene-score colour palette at positions r.
# Positions 1-6 are the graded score colours; 7-10 are greys used for the
# 'Neuronal'/'Others'/unlabelled categories.
# Fix: the original stored the result in a useless local (`pal = ...`) and
# returned it only implicitly (and invisibly); return the value directly.
SFARI_colour_hue = function(r) {
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9',
    '#b3b3b3','#808080','gray','#d9d9d9')[r]
}


Load Dataset


# Select which clustering solution to use and attach it as the Module column
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
# [[ ]] extracts the column as a plain vector directly; the original
# data.frame %>% unlist %>% unname chain did the same with extra steps
clusterings$Module = clusterings[[clustering_selected]]
assigned_module = clusterings %>% dplyr::select(ID, Module)

# Update gene scores to new SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
# Keep one row per Ensembl ID and drop entries with a missing ID
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Dataset created with 20_04_07_create_dataset.html (change SFARI scores to Old SFARI scores)
# Replace the dataset's gene.score column with the scores from the SFARI file
# loaded above; genes without a score are labelled 'None'
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
rownames_dataset = rownames(dataset)
# NOTE(review): mutate(Module = clusterings$Module) assumes dataset and
# clusterings contain the same genes in the same row order -- confirm upstream
dataset = dataset %>% dplyr::select(-gene.score) %>% mutate(ID = rownames_dataset) %>%
          left_join(SFARI_genes %>% dplyr::select(ID, `gene-score`), by = 'ID') %>%
          mutate(Module = clusterings$Module, 
                 gene.score = ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
          dplyr::select(-matches(clustering_selected), -ID, -`gene-score`)
rownames(dataset) = rownames_dataset  # dplyr verbs drop rownames; restore them


# Fix any Gene Significance that is NA
# Recompute GS for missing genes directly from the expression data as the
# polyserial correlation between expression and Diagnosis
GS_missing = rownames(dataset)[is.na(dataset$GS)]
if(length(GS_missing)>0){
  # Gandal dataset
  load('./../Data/preprocessed_data.RData')
  datExpr = datExpr %>% data.frame
  
  for(g in GS_missing){
    # polyserial: correlation between a continuous and an ordinal variable
    dataset$GS[rownames(dataset) == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
  }
  # Drop everything brought in by the RData file plus loop temporaries
  rm(datExpr, datGenes, datMeta, dds, DE_info, g, GS_missing)
}


# GO Neuronal annotations: flag every gene whose GO functional annotation
# text contains 'neuron' as Neuronal (= 1)
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>%
              filter(grepl('neuron', go_term)) %>%
              transmute(ID = as.character(ensembl_gene_id)) %>%
              distinct(ID) %>%
              mutate(Neuronal = 1)


# Add gene symbol
# Query the archived Feb-2014 Ensembl release (matching Gencode v19) for the
# external gene symbol of every gene in the dataset
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)


rm(getinfo, mart, rownames_dataset, GO_annotations)


The pipeline to prepare the dataset is the same as in 10_classification_model.html, so I won’t repeat the explanation here

# Modules of the genes without a Module-Trait correlation (removed below)
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character

# Rows kept by the filter below; which() drops NAs safely
keep_rows = which(dataset$Module != 'gray' & !is.na(dataset$MTcor))

# Keep genes with an assigned (non-gray) module and a valid MTcor; drop the
# p-value, Module and MMgray columns; encode SFARI membership as a boolean
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor)) %>% 
              dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=gene.score!='None') %>% dplyr::select(-gene.score)
# BUG FIX: rownames must be selected with the SAME condition as the filter
# above; the original only used Module != 'gray', which misaligns gene IDs
# whenever a non-gray gene has NA MTcor
rownames(new_dataset) = rownames(dataset)[keep_rows]

original_dataset = dataset
dataset = new_dataset

rm(rm_cluster, new_dataset, keep_rows)

The final dataset contains 15994 observations (genes) and 59 variables



Exploratory Analysis


PCA of Variables


The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS are in the middle of both groups

The position of the SFARI label almost doesn’t change

# PCA over the variables: transpose so each variable becomes an observation
# (SFARI converted to numeric so the matrix is fully numeric)
pca = dataset %>% mutate(SFARI = as.numeric(SFARI)) %>% t %>% prcomp

# Classify each variable by name; the nesting order matters ('SFARI' is
# checked before 'GS' so it is not swallowed by a broader pattern). The
# 'type' column is computed for reference; the plot below colours by MTcor.
plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))


# One MTcor value per module, used to colour the module-membership variables
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

# MM variable names look like 'MM.<hex>' while modules are '#<hex>': align them
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
         scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
         ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, pca)


PCA of Samples


  • SFARI Genes seem to be evenly distributed everywhere
# Mean Expression data
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA
# Variables were transposed into rows, so pca$rotation holds the per-gene
# (sample) coordinates used below
pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

# SFARI genes drawn more opaque; marginal densities split by SFARI label
p = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')

ggExtra::ggMarginal(p, type='density', groupColour=TRUE, size=10)

# Drop RData objects and plot temporaries
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p)



Dividing samples into Training and Test Sets


Using the same method explained in 10_classification_model.html

The only difference is that we now have a slightly larger proportion of positive observations than before (5.6%)

# Label the SFARI column for the expss cross-tabulation below
table_info = dataset %>% apply_labels(SFARI = 'SFARI')

cro(table_info$SFARI)
 #Total 
 SFARI 
   FALSE  15099
   TRUE  895
   #Total cases  15994
rm(table_info)
set.seed(123)

# Recover the full SFARI score (1-6/'None') of every gene so the partition is
# stratified by score, not only by the binary SFARI label
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>% 
                left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                          by = 'ID') %>% 
                mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

# Stratified 70/30 train/test split
train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]


rm(sample_scores, train_idx)


Logistic Regression


Train model

# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5


# caret classProbs requires a factor outcome with syntactically valid levels
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)

# 10x10 repeated CV with SMOTE over-sampling inside each resample to handle
# the class imbalance
smote_over_sampling = trainControl(method = 'repeatedcv', number = 10, repeats = 10, verboseIter = FALSE, 
                                   classProbs = TRUE, savePredictions = 'final', 
                                   summaryFunction = twoClassSummary, sampling = 'smote')

# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)

#a = caret::thresholder(fit, threshold = seq(0.05, 0.95, by = 0.1), statistics = 'all')

rm(smote_over_sampling)


Performance


The model has an AUC of 0.6664, but, as before, the features are strongly correlated

# VIF
# Compute the variance inflation factor of each feature ONCE (the original
# called car::vif twice), then flag features above the conventional VIF > 10
# multicollinearity threshold
vif_vals = car::vif(fit$finalModel) %>% sort
plot_data = data.frame('Feature' = names(vif_vals),
                       'VIF' = unname(vif_vals)) %>%
            mutate(outlier = VIF>10)

plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + 
              scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + 
              xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

rm(plot_data, vif_vals)


Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics ref, but as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again




Ridge Regression


Notes:

### DEFINE FUNCTIONS

create_train_test_sets = function(p, seed){
  
  # Stratify the split by full SFARI gene score (1-6 / 'None') so every score
  # is proportionally represented in both sets; uses the global `dataset` and
  # `original_dataset` objects
  gene_ids = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID)
  score_lookup = original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score)
  sample_scores = gene_ids %>%
                  left_join(score_lookup, by = 'ID') %>%
                  mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

  # Reproducible stratified partition: proportion p goes to the train set
  set.seed(seed)
  train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)
  
  return(list('train_set' = dataset[train_idx,], 'test_set' = dataset[-train_idx,]))
}



run_model = function(p, over_sampling_fold, seed){
  
  # Train a ridge-penalised (alpha = 0) logistic regression with SMOTE
  # over-sampling on a fresh stratified train/test split, then return the
  # test-set performance metrics, predictions and fitted coefficients
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model
  # caret classProbs requires a factor outcome with valid level names
  train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(seed)
  smote_over_sampling = trainControl(method = 'repeatedcv', number = over_sampling_fold, repeats = 10,
                                     verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                     summaryFunction = twoClassSummary, sampling = 'smote')
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
              tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set
  # Hard labels from a 0.5 threshold on the SFARI class probability
  predictions = fit %>% predict(test_set, type = 'prob')
  preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)

  # Measure performance of the model
  # preds rows are built from test_set rows, so elementwise comparison is valid
  acc = mean(test_set$SFARI==preds$pred)
  prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features
  # Coefficient vector at the CV-selected lambda (intercept first)
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 
              'AUC' = AUC, 'preds' = preds, 'coefs' = coefs))
}


### RUN MODEL

# Parameters
p = 0.75                 # train proportion of each split
over_sampling_fold = 10  # number of CV folds per repeat

n_iter = 25
seeds = 123:(123+n_iter-1)  # one seed per run, for reproducibility

# Store outputs
# Per-iteration performance metrics (grown with c(); n_iter is small)
acc = c()
prec = c()
rec = c()
F1 = c()
AUC = c()
# Accumulators across runs: prob/pred are summed, n counts how many times
# each gene landed in a test set
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
# NOTE(review): assumes SFARI is the LAST column of dataset and that glmnet
# returns one coefficient per remaining column plus intercept -- confirm
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(seed in seeds){
  
  # Run model
  model_output = run_model(p, over_sampling_fold, seed)
  
  # Update outputs
  acc = c(acc, model_output[['acc']])
  prec = c(prec, model_output[['prec']])
  rec = c(rec, model_output[['rec']])
  F1 = c(F1, model_output[['F1']])
  AUC = c(AUC, model_output[['AUC']])
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  # Add this run's results to the accumulators; relies on preds keeping the
  # same gene order as the matching rows of predictions (both follow the
  # original row order of dataset)
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% 
                                                                      preds$ID, c('prob','pred','n')] +
                                                                    update_preds
}

# Average the coefficients over runs; average each gene's probability over the
# runs where it was actually in the test set, then re-threshold at 0.5
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)


rm(p, over_sampling_fold, seeds, update_preds, create_train_test_sets, run_model)


To summarise in a single value the predictions of the models:

# Keep only genes that appeared in at least one test set and attach their
# Gene Significance and Module-Trait correlation
in_test = predictions$n > 0
test_set = predictions[in_test,] %>% 
           left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[in_test]


Performance metrics


Confusion matrix

# Labelled copy for the expss cross-tabulation of actual vs predicted labels
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

# Confusion matrix: actual labels (rows) vs predicted labels (columns)
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  11905 3181   15086
   TRUE  484 411   895
   #Total cases  12389 3592   15981
rm(conf_mat)


Accuracy: Mean = 0.7691 SD = 0.0072


Precision: Mean = 0.1151 SD = 0.008


Recall: Mean = 0.4742 SD = 0.0359


F1 score: Mean = 0.1851 SD = 0.0128


ROC Curve: Mean = 0.6791 SD = 0.0185

# ROC curve from the run-averaged test-set probabilities
pred_ROCR = prediction(test_set$prob, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')  # diagonal = random classifier


Lift Curve

# Lift curve: captured-positive ratio vs rate of positive predictions
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

# NOTE(review): this removes the upper-case AUC vector collected across the
# model runs (not the local 'auc' above) -- confirm it is no longer needed
rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


MTcor, GS and absGS have a very small coefficient

# Per-gene module assignment with MTcor and SFARI label; module colour codes
# have their '#' stripped so they can be matched against feature names
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% 
                 left_join(assigned_module, by ='ID') %>% mutate(Module = gsub('#','',Module))

# Strip the 'MM.' prefix so module-membership features match module codes.
# Grouping by (feature, coef, MTcor) keeps one row per feature while
# SFARI_perc averages SFARI membership over that module's genes; non-module
# features (Intercept, GS, absGS, MTcor) simply get NA MTcor
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% 
            left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% 
            summarise('SFARI_perc' = mean(SFARI)) %>% arrange(desc(coef))

coef_info %>% dplyr::select(feature, coef) %>% filter(feature %in% c('Intercept','GS','absGS','MTcor')) %>%
              dplyr::rename('Feature' = feature, 'Coefficient' = coef) %>% 
              kable(align = 'cc', caption = 'Regression Coefficients') %>% kable_styling(full_width = F)
Regression Coefficients
Feature Coefficient
MTcor 0.0001566
absGS -0.0372042
GS -0.1208186
Intercept -0.5215226


  • There is a positive relation between the coefficient assigned to the membership of each module and the enrichment (using ORA) in SFARI genes that are assigned to that module
load('./../Data/ORA.RData')

# SFARI enrichment of each module: 1 - (ORA p-value of the 'SFARI' term), or
# 0 when the module has no SFARI term at all.
# BUG FIX: the original iterated names(enrichment_SFARI) but indexed
# enrichment_old_SFARI -- it would fail (or silently mismatch) whenever the
# two objects differ. Iterate the object that is actually indexed; this file
# works with the old SFARI scores, but confirm the intended object.
enrichment_SFARI_info = data.frame('Module'=as.character(), 'SFARI_enrichment'=as.numeric())
for(m in names(enrichment_old_SFARI)){
  m_info = enrichment_old_SFARI[[m]]
  enrichment = 1-ifelse('SFARI' %in% m_info$ID, m_info$pvalue[m_info$ID=='SFARI'],1)
  enrichment_SFARI_info = enrichment_SFARI_info %>% 
                          add_row(Module = gsub('#','',m), SFARI_enrichment = enrichment)
}

# Join coefficients with enrichment, keeping only module-membership features
# (non-module features have NA MTcor and are dropped)
plot_data = coef_info %>% dplyr::rename('Module' = feature) %>% 
            left_join(enrichment_SFARI_info, by = 'Module') %>% filter(!is.na(MTcor))

# Each point is a module, drawn in its own hex colour
ggplotly(plot_data %>% ggplot(aes(coef, SFARI_enrichment)) + 
         geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
         geom_point(aes(id = Module), color = paste0('#',plot_data$Module), alpha=0.7) + 
         theme_minimal() + xlab('Coefficient') + 
         ylab('SFARI Genes Enrichment'))
rm(enrichment_SFARI, enrichment_DGN, enrichment_DO, enrichment_GO, enrichment_KEGG, enrichment_Reactome,
   m, m_info, enrichment)


  • There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

# Coefficient vs Module-Diagnosis correlation, one point per module.
# NOTE(review): the point colours index coef_info in its original order while
# the plotted data is the filtered pipe -- this relies on filter() preserving
# row order; consider computing the colours inside the pipe instead
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)]), 
                         alpha=0.7) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse Results


Note: All of the conclusions from this section are the same found with the new SFARI Genes in 10_classification_model.html

Score distribution by SFARI Label


SFARI genes have a higher score distribution than the rest, but the overlap is large

# Score densities for SFARI vs non-SFARI genes, with dashed lines marking the
# mean score of each group
plot_data = test_set %>% dplyr::select(prob, SFARI)

mean_sfari = mean(plot_data$prob[plot_data$SFARI])
mean_other = mean(plot_data$prob[!plot_data$SFARI])

ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean_sfari, color = '#00C0C2', linetype='dashed') +
         geom_vline(xintercept = mean_other, color = '#FF7371', linetype='dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))


Score distribution by SFARI Score


Even though we didn’t use the actual SFARI Scores to train the model, but instead we grouped them all together, there seems to be positive relation between the SFARI scores and the probability assigned by the model. It’s not significant for adjacent scores, but it usually is for more distant scores

# Recover the full SFARI score of each test-set gene; unscored genes are
# split into 'Neuronal' (has a GO neuronal annotation) and 'Others'
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                       gene.score)) %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  25
   2  65
   3  191
   4  427
   5  164
   6  23
   Neuronal  906
   Others  14180
   #Total cases  15981
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

# Pairwise groups for the Welch t-tests, arranged so significance brackets
# stack by increasing distance between groups.
# NOTE(review): c('3','Neuronal') appears twice (once in each of two rows);
# one occurrence is probably meant to be a different pair -- confirm. The 22
# entries match the 22 heights in pos_y_comparisons below.
comparisons = list(c('1','2'), c('2','3'), c('3','4'), c('4','5'), c('5','6'), c('6','Neuronal'), c('Neuronal','Others'),
                   c('1','3'), c('3','5'), c('5','Neuronal'), 
                   c('2','4'), c('4','6'), c('6','Others'),
                   c('1','4'), c('4','Neuronal'),
                   c('2','5'), c('5','Others'),
                   c('3','Neuronal'), c('1','5'), c('2','6'), c('3','Neuronal'), c('4','Others'))
# Bracket heights: groups of comparisons share a level, later ones stack higher
increase = 0.08
base = 0.9
pos_y_comparisons = c(rep(base, 7), rep(base + increase,3), rep(base + 2*increase,3), rep(base + 3*increase, 2),
                      rep(base + 4*increase, 2), base + 5:9*increase)

# Boxplots of model probability by SFARI score with pairwise Welch t-tests
plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .02) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal() + theme(legend.position = 'none')

rm(mean_vals, increase, base, pos_y_comparisons)


Genes with the highest Probabilities


  • Considering the class imbalance in the test set (1:19), there are many more SFARI scores in here (1:3)

  • High concentration of genes with a SFARI Score of 1, especially at the top

# Top 50 test-set genes by model probability, annotated with gene symbol,
# SFARI score (or Neuronal/Others for unscored genes) and module information
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>%
             mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                        gene.score)) %>%
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' =MTcor,
                           'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4), 
                    GeneSignificance = round(GeneSignificance,4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
                           gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set') %>% 
             kable_styling(full_width = F)
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
TANC2 -0.2362 -0.5573 #00C0BF 0.8555 3
MYCBP2 -0.3960 -0.3847 #00B7E7 0.8463 Neuronal
RIMBP2 0.2592 0.5341 #EF67EB 0.8432 Others
RPRD2 -0.0931 0.3428 #00C08C 0.8421 Others
MBD5 0.1916 0.0721 #96A900 0.8388 3
ARID1B 0.2675 0.3428 #00C08C 0.8380 1
ANK2 -0.4350 -0.9003 #00BADE 0.8347 1
PRDM2 -0.2516 0.0721 #96A900 0.8341 Others
KCNJ6 -0.1389 -0.9003 #00BADE 0.8329 Others
HUNK -0.3276 -0.3731 #4BB400 0.8324 Others
HIVEP2 0.0188 -0.9003 #00BADE 0.8294 Others
EP300 -0.0235 0.3428 #00C08C 0.8254 4
RFX7 0.1381 0.0721 #96A900 0.8237 Others
KMT2A 0.1343 0.6175 #44A0FF 0.8188 1
RAPH1 -0.0554 -0.3847 #00B7E7 0.8184 Others
FMNL1 -0.2229 -0.3847 #00B7E7 0.8155 Others
SRRM4 -0.4423 -0.5573 #00C0BF 0.8116 5
ST8SIA3 -0.1994 -0.7233 #FF63B6 0.8113 Others
ZNF804A -0.2756 -0.3731 #4BB400 0.8112 3
GABBR2 -0.6251 -0.9003 #00BADE 0.8095 4
TLN2 -0.4786 -0.7233 #FF63B6 0.8073 Others
C20orf112 0.0294 0.3428 #00C08C 0.8069 Others
ETV6 -0.1439 -0.6508 #E76BF3 0.8060 Others
RASGRF1 -0.7424 -0.9003 #00BADE 0.8059 Neuronal
TLE3 0.4865 0.5341 #EF67EB 0.8043 Others
ANK3 -0.4808 -0.8123 #00BECA 0.8033 3
NBEA -0.4139 -0.9003 #00BADE 0.8012 4
GRIN2B -0.2787 -0.7233 #FF63B6 0.8011 1
CREBBP 0.1424 0.3428 #00C08C 0.7987 5
CACNG3 -0.4698 -0.6770 #00A6FF 0.7987 Others
SIK3 -0.1236 -0.3847 #00B7E7 0.7980 Others
NRXN3 -0.6712 -0.9003 #00BADE 0.7977 3
CNTNAP2 -0.6999 -0.9003 #00BADE 0.7969 2
CELF2 -0.0655 -0.9003 #00BADE 0.7961 Others
AAK1 -0.6910 -0.9003 #00BADE 0.7960 Others
EP400 -0.1699 -0.5573 #00C0BF 0.7958 3
KCNB1 -0.7190 -0.9003 #00BADE 0.7957 Neuronal
SCAF4 0.0161 -0.5573 #00C0BF 0.7921 Others
RNF111 -0.2393 0.0721 #96A900 0.7915 Others
PLXNC1 -0.0093 0.5341 #EF67EB 0.7913 Others
SRCAP -0.3631 -0.5573 #00C0BF 0.7912 2
BAZ2A 0.0518 0.0721 #96A900 0.7908 Others
GNAS -0.5848 -0.5573 #00C0BF 0.7891 4
PCDHB13 -0.0405 -0.4661 #1FB700 0.7891 Others
GLTSCR1L -0.1611 0.0721 #96A900 0.7887 Others
POM121C -0.2811 -0.5573 #00C0BF 0.7886 Others
KMT2D -0.3274 -0.5573 #00C0BF 0.7883 Others
NFAT5 0.1025 0.0721 #96A900 0.7874 Others
AJAP1 -0.0441 -0.0245 #FE6D8D 0.7868 Others
DAAM1 0.1540 -0.0127 #00BF7D 0.7863 Others





Negative samples distribution


Note: All of the conclusions from this section are the same found with the new SFARI Genes in 10_classification_model.html

The objective of this model is to identify candidate SFARI genes. For this, we are going to focus on the negative samples (the non-SFARI genes)

# Restrict to the negative samples (non-SFARI genes): the candidate pool
negative_set = test_set %>% filter(!SFARI)

# Labelled copy for the expss tabulation of predicted labels
negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability', 
                                                   pred = 'Label Prediction')

cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  11905
   TRUE  3181
   #Total cases  15086

3181 genes are predicted as ASD-related


# Probability density of the negative samples; the dotted line marks the 0.5
# decision threshold
negative_set %>% ggplot(aes(prob)) + 
                 geom_density(fill='#F8766D', color='#F8766D', alpha=0.5) +
                 geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + 
                 xlab('Probability') +
                 ggtitle('Probability distribution of the Negative samples in the Test Set') + 
                 theme_minimal()


# Top 50 negative genes by model probability: the strongest candidate novel
# SFARI genes, annotated with gene symbol and module information
negative_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(negative_set)) %>% 
                 arrange(desc(prob)) %>% top_n(50, wt = prob) %>%
                 left_join(original_dataset %>% mutate(ID = rownames(original_dataset)), by = 'ID')  %>%
                 mutate(gene.score = ifelse(gene.score=='None', 
                                            ifelse(ID %in% GO_neuronal$ID,'Neuronal','Others'), 
                                            gene.score)) %>%
                 left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
                 dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 
                               'ModuleDiagnosis_corr' =MTcor, 'GeneSignificance' = GS) %>%
                 mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4), 
                        Probability = round(Probability, 4), 
                        GeneSignificance = round(GeneSignificance, 4)) %>%
                 dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
                               gene.score) %>%
                 kable(caption = 'Genes with highest model probabilities from the Negative set') %>% 
                 kable_styling(full_width = F)
Genes with highest model probabilities from the Negative set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
MYCBP2 -0.3960 -0.3847 #00B7E7 0.8463 Neuronal
RIMBP2 0.2592 0.5341 #EF67EB 0.8432 Others
RPRD2 -0.0931 0.3428 #00C08C 0.8421 Others
PRDM2 -0.2516 0.0721 #96A900 0.8341 Others
KCNJ6 -0.1389 -0.9003 #00BADE 0.8329 Others
HUNK -0.3276 -0.3731 #4BB400 0.8324 Others
HIVEP2 0.0188 -0.9003 #00BADE 0.8294 Others
RFX7 0.1381 0.0721 #96A900 0.8237 Others
RAPH1 -0.0554 -0.3847 #00B7E7 0.8184 Others
FMNL1 -0.2229 -0.3847 #00B7E7 0.8155 Others
ST8SIA3 -0.1994 -0.7233 #FF63B6 0.8113 Others
TLN2 -0.4786 -0.7233 #FF63B6 0.8073 Others
C20orf112 0.0294 0.3428 #00C08C 0.8069 Others
ETV6 -0.1439 -0.6508 #E76BF3 0.8060 Others
RASGRF1 -0.7424 -0.9003 #00BADE 0.8059 Neuronal
TLE3 0.4865 0.5341 #EF67EB 0.8043 Others
CACNG3 -0.4698 -0.6770 #00A6FF 0.7987 Others
SIK3 -0.1236 -0.3847 #00B7E7 0.7980 Others
CELF2 -0.0655 -0.9003 #00BADE 0.7961 Others
AAK1 -0.6910 -0.9003 #00BADE 0.7960 Others
KCNB1 -0.7190 -0.9003 #00BADE 0.7957 Neuronal
SCAF4 0.0161 -0.5573 #00C0BF 0.7921 Others
RNF111 -0.2393 0.0721 #96A900 0.7915 Others
PLXNC1 -0.0093 0.5341 #EF67EB 0.7913 Others
BAZ2A 0.0518 0.0721 #96A900 0.7908 Others
PCDHB13 -0.0405 -0.4661 #1FB700 0.7891 Others
GLTSCR1L -0.1611 0.0721 #96A900 0.7887 Others
POM121C -0.2811 -0.5573 #00C0BF 0.7886 Others
KMT2D -0.3274 -0.5573 #00C0BF 0.7883 Others
NFAT5 0.1025 0.0721 #96A900 0.7874 Others
AJAP1 -0.0441 -0.0245 #FE6D8D 0.7868 Others
DAAM1 0.1540 -0.0127 #00BF7D 0.7863 Others
SAP130 -0.2497 -0.5573 #00C0BF 0.7861 Others
TP53INP2 -0.2133 -0.4661 #1FB700 0.7857 Others
TENM3 0.1768 -0.7233 #FF63B6 0.7835 Neuronal
ARHGEF3 -0.0482 -0.9003 #00BADE 0.7825 Others
NMNAT2 -0.6814 -0.9003 #00BADE 0.7819 Others
CDC42EP3 -0.3475 -0.9003 #00BADE 0.7819 Others
NCOA2 0.4695 0.5341 #EF67EB 0.7818 Others
PCLO -0.1874 0.0721 #96A900 0.7812 Others
AMPD3 -0.2812 -0.9003 #00BADE 0.7812 Others
ASAP1 -0.0917 -0.3847 #00B7E7 0.7810 Others
AFAP1 -0.1712 0.3428 #00C08C 0.7798 Others
ZNF592 0.2727 0.3428 #00C08C 0.7794 Others
KIAA1217 -0.1881 -0.9003 #00BADE 0.7785 Others
PIEZO2 -0.0776 -0.6508 #E76BF3 0.7784 Others
DROSHA -0.4103 -0.5573 #00C0BF 0.7775 Others
TET3 0.4848 0.5341 #EF67EB 0.7774 Others
LRRC7 0.1828 -0.9003 #00BADE 0.7770 Others
GATAD2B -0.4236 -0.5573 #00C0BF 0.7757 Others




Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a negative relation with the Gene Significance (under-expressed genes having on average the higher probabilities and over-expressed genes the lowest)

  • The pattern is stronger in under-expressed genes

# Probability vs Gene Significance, coloured by the module-trait correlation
# of each gene's module, with a loess trend in gray
negative_set %>% ggplot(aes(prob, GS, color = MTcor)) + 
                 geom_point() + 
                 geom_smooth(method = 'loess', color = '#666666') +
                 geom_hline(yintercept = 0, color='gray', linetype='dashed') + 
                 xlab('Probability') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Probability and Gene Significance') + 
                 theme_minimal()




Probability and Module-Diagnosis correlation


  • There’s not a strong relation between the Module-Diagnosis correlation of the genes assigned module and the probability assigned by the model

  • The model seems to assign slightly higher probabilities to genes belonging to modules with negative module-Diagnosis correlations than to genes belonging to modules with positive ones

# Probability vs module-trait correlation, coloured by Gene Significance;
# the dashed line marks the overall mean probability
negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + 
                 geom_point() + 
                 geom_smooth(method='loess', color='#666666') + 
                 geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) + 
                 ylab('Model probability') +
                 xlab('Modules ordered by their correlation to ASD') + 
                 theme_minimal()




Summarised version, plotting by module instead of by gene


The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

The model seems to give lower probabilities to genes belonging to modules with a high (absolute) correlation to Diagnosis (this is unexpected)

# Aggregate probabilities by module (all genes of a module share one MTcor).
# NOTE(review): the left_join key is MTcor, a floating-point column; it works
# here because both sides carry the exact same stored values, but joining by
# Module would be more robust
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% 
            left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'

# One point per module, sized by gene count; loess and linear trends overlaid
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID), alpha=0.7) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())




Probability and level of expression


There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias

We had already found some mean expression bias in the SFARI scores, but thought we had eliminated this information when constructing the WGCNA network (because correlation is independent to linear transformations), but a signal related to this seems to have survived into the network and the model is picking it up

# Gandal dataset
# Loads datExpr (expression matrix), DE_info (differential expression results),
# datGenes, datMeta and dds (see the preprocessing script referenced in the
# header); only datExpr and DE_info are used in this section
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# Per-gene mean and standard deviation of expression across all samples
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

# Attach mean expression and module membership to the negative-set predictions
plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')
# Rename the last column to 'Module' -- presumably needed because the join
# suffixes the name when negative_set already carries a Module column;
# TODO confirm against the structure of negative_set
colnames(plot_data)[ncol(plot_data)] = 'Module'

# Gene-level view: mean expression vs model probability, with a LOESS trend
ggplot(plot_data, aes(meanExpr, prob)) +
  geom_point(alpha=0.1, color='#0099cc') +
  geom_smooth(method='loess', color='gray', alpha=0.3) +
  labs(x = 'Mean Expression', y = 'Probability',
       title = 'Mean expression vs model probability by gene') +
  theme_minimal()

rm(mean_and_sd)

# Collapse to one observation per module
module_summary = plot_data %>%
                 group_by(Module) %>%
                 summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n = n())

# Module-level view; each point is painted with its module's own colour
# (assumes the WGCNA module labels are valid R colour names -- TODO confirm)
module_plot = ggplot(module_summary, aes(meanExpr, meanProb, size=n)) +
              geom_point(color=module_summary$Module, alpha = 0.6) +
              geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
              xlab('Mean Level of Expression') + ylab('Average Model Probability') +
              theme_minimal() + theme(legend.position='none') +
              ggtitle('Mean expression vs model probability by Module')
ggplotly(module_plot)
rm(module_summary, module_plot)




Probability and LFC


There is a relation between probability and LFC, so it IS capturing a bit of true information (because LFC and mean expression were negatively correlated and it still has a positive relation in the model)

  • The relation is stronger in genes under-expressed in ASD
# Attach the differential expression statistics (LFC, adjusted p-value) to the
# negative-set predictions
plot_data = negative_set %>%
            left_join(DE_info %>% mutate(ID = rownames(DE_info)), by = 'ID')

# Gene-level relation between log fold change and model probability
ggplot(plot_data, aes(log2FoldChange, prob)) +
  geom_point(alpha=0.1, color='#0099cc') +
  geom_smooth(method='loess', color='gray', alpha=0.3) +
  labs(x = 'LFC', title = 'LFC vs model probability by gene') +
  theme_minimal()

  • The relation is stronger in Differentially Expressed genes
# Left panel: genes under-expressed in ASD (log2FoldChange < 0), coloured by
# whether the gene is differentially expressed (padj < 0.05); y-limits are
# fixed to the full range of prob so both panels share the same scale
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>% 
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + 
     ylim(c(min(plot_data$prob), max(plot_data$prob))) + 
     # negative right margin butts this panel against p2 in grid.arrange below
     theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

# Right panel: genes over-expressed in ASD (log2FoldChange >= 0), coloured by
# DE status (padj < 0.05); the y-axis is drawn on the right with the same
# limits as p1 so the two panels read as one continuous plot
# (fix: removed a redundant ylab('Probability') that was immediately
# overridden by ylab(''))
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% 
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('') +
     scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
     # negative left margin butts this panel against p1 in grid.arrange below
     theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

# Combine the two half-panels into a single figure with a shared title and a
# shared x-axis label
grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2)



Mean expression bias with old and new SFARI Genes


The bias using both SFARI Gene scoring systems is quite similar

# Load the model trained with the NEW SFARI Gene scores; provides (at least)
# the `predictions` data.frame used below -- assumed from usage, confirm the
# contents of Ridge_model.RData
load('./../Data/Ridge_model.RData')

# Recompute per-gene mean/sd of expression (mean_and_sd was removed earlier)
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

# old_prob = probability from the model trained with the old SFARI scores
# (negative_set), prob = probability from the newly loaded model (predictions)
plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>% dplyr::rename('old_prob' = prob) %>%
            inner_join(predictions %>% dplyr::select(ID, prob), by = 'ID') %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')
# Rename the last column to 'Module' -- presumably the join suffixed the
# original column name; TODO confirm
colnames(plot_data)[ncol(plot_data)] = 'Module'

# Overlay the two mean-expression biases: old scoring in pink, new in orange
plot_data %>% ggplot(aes(meanExpr, old_prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='#cc3399', fill='#cc3399', alpha=0.15) +
              geom_smooth(method='loess', color='#ff9900', fill='#ff9900', alpha=0.15, aes(y=prob)) + 
              xlab('Mean Expression') + ylab('Probability') + 
              ggtitle('Bias with old SFARI Genes (pink) vs with New SFARI Genes (orange)') +
              theme_minimal()

rm(mean_and_sd)

Conclusion


The model is capturing the mean level of expression of the genes (indirectly, through module membership), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)

There doesn’t seem to be a big difference between models using the Old SFARI Gene scoring system and the new one


Saving results

predictions = test_set %>% left_join(gene_names, by = c('ID' = 'ensembl_gene_id'))

save(predictions, dataset, fit, file='./../Data/Ridge_model_old_SFARI.RData')




Session info

# Record R version and package versions for reproducibility
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DMwR_0.4.1         doParallel_1.0.15  iterators_1.0.12   foreach_1.5.0     
##  [5] kableExtra_1.1.0   expss_0.10.2       corrplot_0.84      MLmetrics_1.1.1   
##  [9] car_3.0-7          carData_3.0-3      ROCR_1.0-7         gplots_3.0.3      
## [13] caret_6.0-86       lattice_0.20-41    polycor_0.7-10     biomaRt_2.40.5    
## [17] ggpubr_0.2.5       magrittr_1.5       RColorBrewer_1.1-2 gridExtra_2.3     
## [21] viridis_0.5.1      viridisLite_0.3.0  plotly_4.9.2       knitr_1.28        
## [25] forcats_0.5.0      stringr_1.4.0      dplyr_1.0.0        purrr_0.3.4       
## [29] readr_1.3.1        tidyr_1.1.0        tibble_3.0.1       ggplot2_3.3.2     
## [33] tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.1.0            RSQLite_2.2.0              
##   [3] AnnotationDbi_1.46.1        htmlwidgets_1.5.1          
##   [5] BiocParallel_1.18.1         pROC_1.16.2                
##   [7] munsell_0.5.0               codetools_0.2-16           
##   [9] miniUI_0.1.1.1              withr_2.2.0                
##  [11] colorspace_1.4-1            Biobase_2.44.0             
##  [13] highr_0.8                   rstudioapi_0.11            
##  [15] stats4_3.6.3                ggsignif_0.6.0             
##  [17] TTR_0.23-6                  labeling_0.3               
##  [19] GenomeInfoDbData_1.2.1      bit64_0.9-7                
##  [21] farver_2.0.3                vctrs_0.3.1                
##  [23] generics_0.0.2              ipred_0.9-9                
##  [25] xfun_0.12                   R6_2.4.1                   
##  [27] GenomeInfoDb_1.20.0         locfit_1.5-9.4             
##  [29] bitops_1.0-6                DelayedArray_0.10.0        
##  [31] assertthat_0.2.1            promises_1.1.0             
##  [33] scales_1.1.1                nnet_7.3-14                
##  [35] ggExtra_0.9                 gtable_0.3.0               
##  [37] timeDate_3043.102           rlang_0.4.6                
##  [39] genefilter_1.66.0           splines_3.6.3              
##  [41] lazyeval_0.2.2              ModelMetrics_1.2.2.2       
##  [43] acepack_1.4.1               broom_0.5.5                
##  [45] checkmate_2.0.0             yaml_2.2.1                 
##  [47] reshape2_1.4.4              abind_1.4-5                
##  [49] modelr_0.1.6                crosstalk_1.1.0.1          
##  [51] backports_1.1.8             quantmod_0.4.17            
##  [53] httpuv_1.5.2                Hmisc_4.4-0                
##  [55] tools_3.6.3                 lava_1.6.7                 
##  [57] ellipsis_0.3.1              BiocGenerics_0.30.0        
##  [59] Rcpp_1.0.4.6                plyr_1.8.6                 
##  [61] base64enc_0.1-3             progress_1.2.2             
##  [63] zlibbioc_1.30.0             RCurl_1.98-1.2             
##  [65] prettyunits_1.1.1           rpart_4.1-15               
##  [67] zoo_1.8-8                   S4Vectors_0.22.1           
##  [69] SummarizedExperiment_1.14.1 haven_2.2.0                
##  [71] cluster_2.1.0               fs_1.4.0                   
##  [73] data.table_1.12.8           openxlsx_4.1.4             
##  [75] reprex_0.3.0                matrixStats_0.56.0         
##  [77] hms_0.5.3                   mime_0.9                   
##  [79] evaluate_0.14               xtable_1.8-4               
##  [81] XML_3.99-0.3                rio_0.5.16                 
##  [83] jpeg_0.1-8.1                readxl_1.3.1               
##  [85] shape_1.4.4                 IRanges_2.18.3             
##  [87] compiler_3.6.3              KernSmooth_2.23-17         
##  [89] crayon_1.3.4                htmltools_0.4.0            
##  [91] mgcv_1.8-31                 later_1.0.0                
##  [93] Formula_1.2-3               geneplotter_1.62.0         
##  [95] lubridate_1.7.4             DBI_1.1.0                  
##  [97] dbplyr_1.4.2                MASS_7.3-51.6              
##  [99] Matrix_1.2-18               cli_2.0.2                  
## [101] gdata_2.18.0                gower_0.2.1                
## [103] GenomicRanges_1.36.1        pkgconfig_2.0.3            
## [105] foreign_0.8-76              recipes_0.1.10             
## [107] xml2_1.2.5                  annotate_1.62.0            
## [109] webshot_0.5.2               XVector_0.24.0             
## [111] prodlim_2019.11.13          rvest_0.3.5                
## [113] digest_0.6.25               rmarkdown_2.1              
## [115] cellranger_1.1.0            htmlTable_1.13.3           
## [117] curl_4.3                    shiny_1.4.0.2              
## [119] gtools_3.8.2                lifecycle_0.2.0            
## [121] nlme_3.1-147                jsonlite_1.7.0             
## [123] fansi_0.4.1                 pillar_1.4.4               
## [125] fastmap_1.0.1               httr_1.4.1                 
## [127] survival_3.1-12             xts_0.12-0                 
## [129] glue_1.4.1                  zip_2.0.4                  
## [131] png_0.1-7                   glmnet_3.0-2               
## [133] bit_1.1-15.2                class_7.3-17               
## [135] stringi_1.4.6               blob_1.2.1                 
## [137] DESeq2_1.24.0               latticeExtra_0.6-29        
## [139] caTools_1.18.0              memoise_1.1.0